In this example notebook, we explore how to use each tree-boosting algorithm and how to interpret the resulting models with various techniques (e.g. SHAP and LIME).
import pandas as pd
import numpy as np
import os, sys
pd.set_option('display.max_colwidth', 1000)
import warnings
warnings.filterwarnings("ignore")
# ML
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
import catboost
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.metrics import recall_score, confusion_matrix, roc_auc_score
from sklearn.metrics import classification_report, roc_curve, precision_recall_curve
# Viz
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style("white")
from IPython.display import display, HTML
# plotly
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff
py.init_notebook_mode(connected = True)
# model interpretation
import shap
import lime
shap.initjs() # essential for visualization
# Reproducibility / working-directory setup.
SEED = 4895
# Directory of this file when run as a script; falls back to CWD in a notebook.
PATH = os.path.dirname(__file__) if "__file__" in locals() else os.getcwd()
os.chdir(PATH)
np.random.seed(SEED)
def missing_values_table(df):
    """
    Summarize missing (NaN) values in a dataframe against its size.

    @Args:
        df: the input dataframe for analysis
    Return:
        mis_val_table_ren_columns: dataframe with one row per column that has
        missing data, reporting the count of missing values and the
        percentage of missing against the total row count (rounded to 1 dp).
    """
    # Count NaNs once and reuse the result (the original called
    # df.isnull().sum() twice — once for counts, once for percentages).
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * mis_val / len(df)
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mis_val_table_ren_columns = mis_val_table.rename(
        columns={0: 'Missing Values', 1: '% of Total Values'})
    # Keep only columns with at least one missing value, worst first.
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:, 1] != 0
    ].sort_values('% of Total Values', ascending=False).round(1)
    print("There are %s columns that have missing values.\n" % (mis_val_table_ren_columns.shape[0]))
    if mis_val_table_ren_columns.shape[0] > 0:
        print(mis_val_table_ren_columns.head(5))
    return mis_val_table_ren_columns
def read_data(path:str='../data/'):
    """Load a CSV, lower-case its column names, preview it and report NaNs."""
    frame = pd.read_csv(path)
    frame.columns = [name.lower() for name in frame.columns]
    print(frame.shape)
    display(frame.head(3))
    _ = missing_values_table(frame)
    return frame
# Load the churn dataset (prints shape, preview and missing-value summary).
df = read_data('../data/Churn_Modelling.csv')
def visual_target_distribution(df:pd.DataFrame, targetColName:str):
    """Show the target balance: pie chart of shares (left), count plot (right)."""
    plt.figure(figsize=(13,6))
    # Left panel: percentage share of each label.
    plt.subplot(121)
    counts = df[targetColName].value_counts()
    counts.plot.pie(autopct="%1.1f%%", fontsize=14,
                    wedgeprops={"linewidth": 2, "edgecolor": "w"})
    plt.title("Target label percentages")
    plt.ylabel("")
    # Right panel: raw counts, with each bar annotated with its value.
    plt.subplot(122)
    axis = sns.countplot(y=df[targetColName], linewidth=2,
                         edgecolor="k" * df[targetColName].nunique())
    for row, label_count in enumerate(counts.values):
        axis.text(.1, row, label_count, fontsize=14, color="k")
    plt.title("Count of target label")
    plt.grid(True, alpha=.1)
    plt.show()
    return
# Identifier and target column names.
idxCol, targetCol = 'customerid', 'exited'
visual_target_distribution(df, targetCol)
# rownumber is not useful
df.drop('rownumber', axis=1, inplace=True)
# Feature columns: everything except the id and the target.
cols = [i for i in df.columns if (i not in [idxCol, targetCol])]
# Quick univariate EDA: choose a plot per column based on dtype / cardinality.
for col in cols:
    print('Processing column name: "{}"'.format(col))
    if df[col].dtypes == 'object':
        # Categorical: count plot, unless there are too many distinct values.
        if df[col].nunique() >= 10:
            print('\tToo many unique values to plot')
            print('\tPrint number of unique values instead: {}'.format(df[col].nunique()))
        else:
            ax = sns.countplot(y=df[col], linewidth=2,
                               edgecolor = "k"*df[col].nunique())
            plt.show()
    elif df[col].dtypes == 'int64':
        # Integer: box plot when quasi-continuous, count plot otherwise.
        if df[col].nunique() >= 5:
            ax = sns.boxplot(x=df[col], orient='v', linewidth=1.5)
            plt.show()
        else:
            ax = sns.countplot(y=df[col], linewidth=2,
                               edgecolor = "k"*df[col].nunique())
            plt.show()
    else:
        # Float column: box plot.
        ax = sns.boxplot(x=df[col], orient='v', linewidth=1.5)
        plt.show()
    print('\n')
# Sanity check: a single surname can map to many distinct customers.
df.groupby('surname').customerid.nunique().sort_values(ascending=False).head(10)
df.head()
Different people may share the same surname without being related, so we remove this column.
df.drop('surname', axis=1, inplace=True)
# One-hot encode the categorical features geography and gender.
transformed_df = pd.concat([
    df, pd.get_dummies(df['geography'], prefix='country'), pd.get_dummies(df['gender'], prefix='gender')
], axis=1)
# Drop the originals plus one dummy level each to avoid perfect collinearity.
transformed_df.drop(['geography', 'gender', 'gender_Male', 'country_Spain'], axis=1, inplace=True)
transformed_df.head(2)
# Module-level names consumed by generate_train_test.
id_col = 'customerid'
target_col = 'exited'
def generate_train_test(df:pd.DataFrame,
                        test_size:float=0.2):
    """
    Split df into train/test features and labels.

    Relies on the module-level names ``id_col`` and ``target_col`` and on
    ``SEED`` for a reproducible split.

    @Args:
        df: full dataframe containing features, the id column and the target
        test_size: fraction of rows held out for the test set
    Return:
        X_train, y_train, X_test, y_test
    """
    train_df, test_df = train_test_split(df, test_size=test_size, random_state=SEED)
    # BUG FIX: the original used `i not in id_col + target_col`, which
    # concatenates the two names into one string ('customeridexited') and
    # performs a SUBSTRING test on each column name. Use explicit membership
    # in the pair of excluded column names instead.
    cols = [i for i in df.columns if i not in (id_col, target_col)]
    X_train = train_df[cols]
    X_test = test_df[cols]
    y_train = train_df[target_col]
    y_test = test_df[target_col]
    return X_train, y_train, X_test, y_test
def generate_feature_important_matrix(model, X_train:pd.DataFrame):
    """Pair each training column with its importance score, sorted descending."""
    names = pd.DataFrame(X_train.columns)
    scores = pd.DataFrame(model.feature_importances_)
    summary = pd.merge(names, scores, left_index=True, right_index=True, how='right')
    summary.columns = ['features', 'coefficients']
    summary.sort_values(by='coefficients', ascending=False, inplace=True)
    return summary
def print_model_evaluation(model, y_true, y_pred, y_prob):
    """Print the classification report plus accuracy, F1 and ROC-AUC scores."""
    print("\nClassification report: \n", classification_report(y_true, y_pred))
    accuracy = np.round(accuracy_score(y_true, y_pred), 4)
    print("\nAccuracy Score: ", accuracy)
    f1 = np.round(f1_score(y_true, y_pred), 4)
    print("F1 Score: ", f1)
    # AUC needs the positive-class probability, not the hard prediction.
    auc = np.round(roc_auc_score(y_true, y_prob[:, 1]), 4)
    print("Area Under Curve: ", auc, "\n")
    return
def plot_model_evaluation(model, y_true, y_pred, y_prob,
                          X_train:pd.DataFrame,
                          figsize:tuple=(14, 18)):
    """
    Render a single-figure performance report: ROC curve, precision-recall
    curve, feature importances and a confusion matrix with row percentages.

    @Args:
        model: fitted classifier exposing ``feature_importances_``
        y_true: ground-truth binary labels
        y_pred: hard class predictions
        y_prob: predicted probabilities, shape (n_samples, 2)
        X_train: training features (supplies the importance column names)
        figsize: overall figure size
    """
    fig = plt.figure(figsize=figsize)
    # --- ROC curve (top-left) ---
    # BUG FIX: the original passed the hard predictions (y_pred) to
    # roc_curve, which collapses the curve to a single point; use the
    # positive-class probabilities, consistent with the AUC in the title.
    fpr, tpr, _ = roc_curve(y_true, y_prob[:, 1])
    plt.subplot(321)
    plt.plot(fpr, tpr, color='red', lw=1, alpha=.8,
             label='{}'.format(str(model).split('(', 1)[0]))
    plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
    plt.xlim([-0.1, 1.0])
    plt.ylim([0.0, 1.0])
    plt.legend()
    plt.title("ROC Curve of the model ({:.2f})"\
              .format(np.round(roc_auc_score(y_true, y_prob[:, 1]), 4)),
              fontdict={'size': 16})
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    # --- Precision-recall curve (top-right) ---
    plt.subplot(322)
    lr_precision, lr_recall, _ = precision_recall_curve(y_true, y_prob[:, 1])
    # Baseline precision of a random classifier = positive-class rate.
    # (local renamed from `exit`, which shadowed the builtin)
    pos_rate = len(y_true[y_true==1]) / len(y_true)
    plt.plot([0, 1], [pos_rate, pos_rate], linestyle='--',
             color='navy', alpha=.6, label='{} (random)'.format(target_col))
    plt.plot(lr_recall, lr_precision, lw=1, alpha=.8,
             color='red', label='{}'.format(str(model).split('(', 1)[0]))
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('F1 score: {:.2f}'.format(f1_score(y_true, y_pred)),
              fontdict={'size': 16})
    plt.legend()
    # --- Feature importances (middle row) ---
    plt.subplot(312)
    smry_df = generate_feature_important_matrix(model, X_train)
    ax = sns.barplot(y="coefficients", x="features", data=smry_df, color="red", alpha=.3)
    for p in ax.patches:
        # Annotate each bar with its importance value.
        ax.annotate(format(p.get_height(), '.2f'),
                    (p.get_x()+p.get_width()/2.,
                     p.get_height()),
                    ha='center', va='center', xytext=(0, 3),
                    textcoords='offset points')
    plt.xticks(rotation=30)
    plt.xlabel('Features', fontdict={'size': 14})
    plt.yticks([])
    plt.ylabel(None)
    plt.title("Feature Importances",
              fontdict={'size': 16})
    # --- Confusion matrix (bottom row) ---
    plt.subplot(313)
    cm = confusion_matrix(y_true, y_pred, labels=np.unique(y_true))
    cm_sum = np.sum(cm, axis=1, keepdims=True)
    cm_perc = cm / cm_sum.astype(float) * 100  # row-wise percentages
    annot = np.empty_like(cm).astype(str)
    nrows, ncols = cm.shape
    for i in range(nrows):
        for j in range(ncols):
            c = cm[i, j]
            p = cm_perc[i, j]
            if i == j:
                # Diagonal: percentage plus "hits / row total".
                s = cm_sum[i]
                annot[i, j] = '%.2f%% \n %d / %d' % (p, c, s)
            elif c == 0:
                annot[i, j] = ''
            else:
                annot[i, j] = '%.2f%%\n%d' % (p, c)
    cm = pd.DataFrame(cm, index=np.unique(y_true), columns=np.unique(y_true))
    cm.index.name = 'Actual'
    cm.columns.name = 'Predicted'
    sns.heatmap(cm, cmap= "RdYlGn", annot=annot,
                fmt='', alpha=.7, cbar=False)
    plt.subplots_adjust(hspace=.5)
    plt.suptitle("Performance Report of {}".format(str(model).split('(', 1)[0]),
                 size=24)
    plt.show()
    return
# ---- Random Forest baseline ----
X_train, y_train, X_test, y_test = generate_train_test(transformed_df, test_size=.25)
rf_model = RandomForestClassifier(n_estimators=100, random_state=SEED)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
y_prob = rf_model.predict_proba(X_test)
print_model_evaluation(model=rf_model, y_true=y_test, y_pred=y_pred, y_prob=y_prob)
plot_model_evaluation(rf_model, y_test, y_pred, y_prob, X_train)
# ---- XGBoost ----
X_train, y_train, X_test, y_test = generate_train_test(transformed_df, test_size=.25)
xgb_model = xgb.XGBClassifier(random_state=SEED)
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
y_prob = xgb_model.predict_proba(X_test)
print_model_evaluation(model=xgb_model, y_true=y_test, y_pred=y_pred, y_prob=y_prob)
plot_model_evaluation(xgb_model, y_test, y_pred, y_prob, X_train)
# ---- CatBoost: trained on the raw df since it handles categoricals natively.
# cat_features=[1, 2] are positional indices — presumably geography and
# gender after the earlier column drops; TODO confirm against df.columns.
X_train, y_train, X_test, y_test = generate_train_test(df, test_size=.25)
cat_model = catboost.CatBoostClassifier(random_state=SEED, verbose=False)
cat_model.fit(X_train, y_train, cat_features=[1, 2])
y_pred = cat_model.predict(X_test)
y_prob = cat_model.predict_proba(X_test)
print_model_evaluation(model=cat_model, y_true=y_test, y_pred=y_pred, y_prob=y_prob)
plot_model_evaluation(cat_model, y_test, y_pred, y_prob, X_train)
# ---- LightGBM ----
X_train, y_train, X_test, y_test = generate_train_test(transformed_df, test_size=.25)
lgb_model = lgb.LGBMClassifier(random_state=SEED)
lgb_model.fit(X_train, y_train)
y_pred = lgb_model.predict(X_test)
y_prob = lgb_model.predict_proba(X_test)
print_model_evaluation(model=lgb_model, y_true=y_test, y_pred=y_pred, y_prob=y_prob)
plot_model_evaluation(lgb_model, y_test, y_pred, y_prob, X_train)
In the above section, I have provided the feature importance plot along with each model algorithm. In short, feature importance plot shows the ranking and resulting values (importance).
However, when we use the resulting values, we need to understand what is the value represented. There are several calculation / types:
1. Gain: implies the relative contribution to the model calculated by using each feature's contribution for each tree in the model. The higher the value, more important of the feature for predicting the result.
2. Coverage: relative number of observations related to each feature (feature x used to decide the leaf node for a, b, and c observations; a+b+c), and expressed as a percentage for all features' cover metrics.
3. Weight: is the percentage representing the relative number of times the feature occurs in the trees of the model, and expressed as its percentage weight over weights of all features.
Generally, we will focus on gain of each feature.
For interpretation, there are 2 main classes: local and global.
Local interpretability: provide detailed explanations of how each individual prediction was made. This will help the users to trust our model and understand how the model / recommendations are being made.
Global interpretability: provide overall understanding of the structure of the model, how the model works in general. This is more important for senior or sponsors needing to understand the model at high level.
There are two main ways to look at a classification or a regression model:
1. `eli5.show_weights()` — inspect the model's parameters globally; 2. `eli5.show_prediction()` — inspect an individual prediction. First, `import eli5`.
eli5 can show the weight which is similar to LGB feature importance plot
# Global weights, similar to the LightGBM feature-importance plot.
eli5.show_weights(lgb_model)
One thing we can do with eli5 is permutation importance (or Mean Decrease Accuracy, MDA).
The idea is that feature importance can be measured by looking at how much the score (accuracy, F1, etc.) decreases when a feature is not available. However, to avoid re-training the model, the feature is not removed but replaced with random noise. This works if the noise is drawn from the same distribution as the original feature values. The simplest way to get such noise is to shuffle the values of the feature (i.e. use other examples' feature values).
It can be resource-intensive if the number of columns are huge.
To summarize, the process of permutation is as follows:
To interpret the permutation importances, the values toward the top are the most important features, and those toward the bottom matter least.
The output weight are in the following form x±y, the first number shows how much model performance decreased with a random shuffling. The ± sign shows some randomness to the exact performance change from a shuffling on the feature. This randomness is measured by repeating the shuffling process.
Occasionally, permutation importance gives negative results: the predictions on the shuffled data turn out to be more accurate than on the real data. This happens when the feature didn't matter (its actual importance is close to 0) and random chance caused the predictions on the shuffled data to be more accurate.
But it doesn't tell you how each feature matters. A feature with medium permutation importance could have a large effect on a few predictions but none on the rest, or a small-to-medium effect on every prediction.
# Permutation importance measured on the training data.
perm = eli5.sklearn.PermutationImportance(lgb_model, random_state=SEED).fit(X_train, y_train)
eli5.show_weights(perm, feature_names = X_train.columns.tolist())
# Permutation importance on the held-out test data (less affected by overfit).
perm = eli5.sklearn.PermutationImportance(lgb_model, random_state=SEED).fit(X_test, y_test)
eli5.show_weights(perm, feature_names = X_test.columns.tolist())
eli5 can also do the local explanation, that is explained the individual prediction of the black box model. It explains each predictions by showing feature weights, which are calculated by following decision paths of each tree in the ensemble. Each leaf has an output score, and expected scores can also be assigned to parent nodes. Contribution of each feature on the decision path is how much expected score changes from parent to child. Weights of all features sum to the output score of the model.
Below are some examples.
# Local explanations for two individual test rows.
eli5.explain_prediction_lightgbm(lgb_model, X_test.iloc[1])
eli5.explain_prediction_lightgbm(lgb_model, X_test.iloc[4])
The goal is to visualize the impact of certain features towards model prediction for any supervised learning algorithm using partial dependence plots.
This is very useful and can provide additional insights to the questions like;
from pdpbox import pdp, get_dataset, info_plots
Target plots
Plot average target values across the different feature values (feature grids)
# Average target value for each value of numofproducts.
fig, axes, _smry_df = info_plots.target_plot(df=transformed_df,
                                             feature="numofproducts",
                                             feature_name="numofproducts",
                                             target="exited")
# Average churn rate by gender (gender_Female: 0 = Male, 1 = Female).
fig, axes, _smry_df = info_plots.target_plot(df=transformed_df,
                                             feature="gender_Female",
                                             feature_name="Gender",
                                             target="exited")
_ = axes['bar_ax'].set_xticklabels(["Male", "Female"])
Check the prediction distribution through feature
# Distribution of model predictions across numofproducts values.
fig, axes, _smry_df = info_plots.actual_plot(model=lgb_model, X=transformed_df[X_train.columns], feature="numofproducts",
                                             feature_name="numofproducts", predict_kwds={})
Now let's see the PDP for top features.
# Partial dependence of the prediction on age.
pdp_goals = pdp.pdp_isolate(model=lgb_model, dataset=X_test,
                            model_features=X_test.columns, feature='age')
pdp.pdp_plot(pdp_goals, 'Ages')
plt.show()
# Same PDP with individual (ICE) lines and a quantile-scaled x-axis.
fig, axes = pdp.pdp_plot(pdp_goals, 'Ages', center=True, plot_lines=True,
                         x_quantile=True, show_percentile=True,
                         frac_to_plot=100, plot_pts_dist=True)
plt.show()
# Partial dependence on the number of products.
pdp_goals = pdp.pdp_isolate(model=lgb_model, dataset=X_test,
                            model_features=X_test.columns, feature='numofproducts')
pdp.pdp_plot(pdp_goals, '# of products')
plt.show()
fig, axes = pdp.pdp_plot(pdp_goals, 'numofproducts', center=True, plot_lines=True, frac_to_plot=100, plot_pts_dist=True)
2D Partial Dependence Plots
If you are curious about interactions between features, 2D partial dependence plots are also useful. An example may clarify this.
# 2-D interaction PDP between age and numofproducts.
inter1 = pdp.pdp_interact(model=lgb_model,
                          dataset=X_test, model_features=X_test.columns,
                          features=['age', 'numofproducts'])
pdp.pdp_interact_plot(pdp_interact_out=inter1, feature_names=['age', 'numofproducts'],
                      plot_type='grid', x_quantile=True, plot_pdp=False)
plt.show()
Disadvantage: The assumption of independence is the biggest issue with PD plots. It is assumed that the feature(s) for which the partial dependence is computed are not correlated with other features.
Several Disadvantages
# Tree SHAP explainer for the fitted LightGBM model.
explainer = shap.TreeExplainer(lgb_model)
# Regenerate the split (same SEED, so the split is reproducible).
X_train, y_train, X_test, y_test = generate_train_test(transformed_df, test_size=.25)
shap_values = explainer.shap_values(X_test) # compute the Shapley values
Features with large absolute Shapley values are important (global importance). Permutation feature importance is based on the decrease in model performance. SHAP is based on magnitude of feature attributions.
# Global importance: mean |SHAP value| per feature, as a bar chart.
shap.summary_plot(shap_values, X_test, plot_type="bar")
SHAP Summary Plot
Each point in the summary plot is the Shapley value for one feature and one instance.
shap.summary_plot(shap_values, X_test)
SHAP Dependence Plot
dependence_plot automatically selects another feature for coloring when interaction_index='auto':
shap.dependence_plot("age", shap_values, X_test, interaction_index='auto', alpha=.7)
# Dependence plot without interaction coloring.
shap.dependence_plot("age", shap_values, X_test,
                     interaction_index=None, alpha=.7)
Or we can turn the color off by using interaction_index=None
# Same, for the number of products.
shap.dependence_plot("numofproducts", shap_values, X_test, interaction_index=None)
Local interpretation
force_plot demonstrates how each SHAP value influences an individual prediction:
shap.force_plot(explainer.expected_value, shap_values[1], X_test.iloc[[1]])
We train the interpretation model by using command below:
# Build a LIME explainer on the training data.
lime_explainer = lime.lime_tabular.LimeTabularExplainer(X_train.values,
                                                        feature_names=X_train.columns.values.tolist(),
                                                        class_names=['non_exited', 'exited'],
                                                        verbose=True,
                                                        mode='classification'
                                                        )
# Model probabilities and true labels for three sample rows.
print(lgb_model.predict_proba(X_test.iloc[[1, 2, 46]]))
print(y_test.iloc[[1, 2, 46]])
Remark: LIME's local surrogate prediction is approximately the sum of the weights in exp.as_map() times the scaled instance, plus exp.intercept.
# Explain row 1 using all 11 features.
expLgb = lime_explainer.explain_instance(data_row=X_test.values[1], predict_fn=lgb_model.predict_proba,
                                         num_features=11)
expLgb.show_in_notebook(show_table=True)
# Weight list form of the same explanation.
expLgb.as_list()
# Explain two more rows with only the top-5 features.
expLgb = lime_explainer.explain_instance(X_test.values[2], lgb_model.predict_proba, num_features=5)
expLgb.show_in_notebook(show_table=True)
expLgb = lime_explainer.explain_instance(X_test.values[46], lgb_model.predict_proba, num_features=5)
expLgb.show_in_notebook(show_table=True)